拜登正式就任美國第46任總統後,Twitter也將總統官方帳號POTUS轉交拜登團隊。與四年前交接不同的是,拜登並未接收先前川普的POTUS粉絲基礎,追蹤者將從零開始。多數美國媒體聲稱過去拜登的個人推特經營主要面向美國大眾推廣政策理念,相反的川普帳號只跟自己的支持者對話。如今,拜登的個人推特帳號主要扮演轉推POTUS推文的角色,而時下未見有拜登POTUS的詳細分析。拜登POTUS在公共關係經營上真的還是走傳統、建制路線嗎?本專案想要探討拜登POTUS是僅僅將twitter當作另一個宣傳政令跟政府講話的媒體平台,還是存在更簡潔有力的政策想法以及情緒宣傳?
本專案預設tokenize去掉停詞之後取得的關鍵字對於推文含意的辨識度還是有權重差異的,有些特別具象化單詞會被推論為具有特定政策意涵或與特定政策事件相關。而其他較模糊的政治相關用語或一般字詞則被視為不具有明確政策資訊內涵。因此,本專案簡單假設如果POTUS只是一個單純宣傳政令或講話的帳號,應該大量含有上述辨識度高的具象化單字以指涉特定議題或事件。或者,在喜歡數或轉推數較高的推文中,要可以觀察到上述情形。
library(dplyr)
library(tibble)
library(ggplot2)
library(stringr)
library(tidyr)
library(lubridate)
library(scales)
library(tidytext)
library(textdata)
library(cowplot)
# Load the datasets obtained from the twitteR package.
load(file = "biden_tweets_df.rda")
load(file = "AllJoeBidenTweets.rda")

# Derive a readable posting-source label from the raw `statusSource` HTML and
# keep only the columns used downstream.
df <- biden_tweets_df %>%
  mutate(
    # case_when() is vectorized, replacing the per-row sapply() if/else chain;
    # the first matching pattern wins, mirroring the original branch order.
    source = case_when(
      str_detect(statusSource, "Twitter Web App")      ~ "Twitter_Web_App",
      str_detect(statusSource, "Periscope")            ~ "Periscope",
      str_detect(statusSource, "Twitter Media Studio") ~ "Twitter_Media_Studio",
      str_detect(statusSource, "Sprinklr")             ~ "Sprinklr",
      TRUE                                             ~ "The_White_House"
    )
  ) %>%
  select(id, source, text, created, retweetCount, favoriteCount) %>%
  # Posting hour in US Eastern time, used by the time-of-day analyses below.
  mutate(hour = hour(with_tz(created, "EST")))

# Tokenization pattern: split on anything that is not a letter, digit, '#',
# '@', or an apostrophe followed by one of those characters (this keeps
# contractions, hashtags and @mentions intact).
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
# Tokenize the tweets: drop quoted retweets, strip t.co links and '&', split
# into words with the `reg` pattern, and remove stop words / number-only tokens.
tweet_words <- df %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]")) %>%
  arrange(desc(favoriteCount))

# Retweet and favorite counts over time, colored by posting source.
ggplot(data = df) +
  geom_col(mapping = aes(x = created, y = retweetCount, color = source, fill = source))
ggplot(data = df) +
  geom_col(mapping = aes(x = created, y = favoriteCount, color = source, fill = source))

# Engagement summary per posting hour (tweet count, total and mean
# favorites/retweets), most active hours first.
tweeting_hour <- df %>%
  group_by(hour) %>%
  summarise(number = n(),
            sum_favoriteCount = sum(favoriteCount),
            sum_retweetCount = sum(retweetCount),
            mean_favoriteCount = mean(favoriteCount),
            mean_retweetCount = mean(retweetCount)) %>%
  arrange(desc(number))
# The 20 most-favorited sentiment-bearing word rows: join each word to the
# NRC emotion lexicon, then keep the rows with the highest favorite counts
# (ties included by top_n). Note one word can appear once per NRC sentiment.
tweeting_hour_20 <- tweet_words %>%
  select(hour, word, retweetCount, favoriteCount) %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  arrange(desc(favoriteCount)) %>%
  top_n(20, favoriteCount)
# p1: share of tweets posted per hour of the day (US Eastern time).
p1 <- df %>%
  count(hour = hour(with_tz(created, "EST"))) %>%
  mutate(percent = n / sum(n)) %>%
  ggplot(mapping = aes(hour, percent)) +
  geom_line() +
  scale_y_continuous(labels = percent_format()) +
  labs(x = "Hour of a day", y = "% of tweets")

# p2/p3: total and average favorite counts by posting hour.
p2 <- ggplot(tweeting_hour) +
  geom_line(aes(hour, sum_favoriteCount), color = "blue") +
  labs(x = "Hour of a day")
p3 <- ggplot(tweeting_hour) +
  geom_line(aes(hour, mean_favoriteCount), color = "red") +
  labs(x = "Hour of a day")

# p4: posting hours of the top-20 most-favorited sentiment words.
p4 <- ggplot(tweeting_hour_20) +
  geom_point(aes(x = hour, y = word)) +
  labs(x = "Hour of a day")

# Arrange the four panels in a 2x2 grid.
plot_grid(p1, p2, p3, p4, ncol = 2)

# Whether a Picture/Link is likely to be included in a Tweet
# Count tweets with vs. without an attached picture/link per source; a "t.co"
# substring in the text indicates Twitter-shortened media or a link.
picture <- df %>%
  filter(!str_detect(text, '^"')) %>%
  count(source, picture = ifelse(str_detect(text, "t.co"),
                                 "Picture/link", "No picture/link"))
picture
## # A tibble: 7 x 3
## source picture n
## <chr> <chr> <int>
## 1 Periscope Picture/link 32
## 2 Sprinklr No picture/link 1
## 3 Sprinklr Picture/link 4
## 4 The_White_House No picture/link 44
## 5 The_White_House Picture/link 541
## 6 Twitter_Media_Studio Picture/link 34
## 7 Twitter_Web_App Picture/link 4

# Overall picture/link share across all tweets.
df %>%
  filter(!str_detect(text, '^"')) %>%
  count(picture = ifelse(str_detect(text, "t.co"),
                         "Picture/link", "No picture/link")) %>%
  ggplot() +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(aes(picture, n), width = 0.5) +
  labs(x = "", y = "count")

# The 20 most frequent (non-stop-word) tokens in the corpus.
tweet_words %>%
  count(word) %>%
  mutate(word = reorder(word, n)) %>%
  top_n(20, n) %>%
  ggplot() +
  geom_col(aes(x = word, y = n)) +
  ylab("frequency of occurrence") +
  coord_flip()

# Word Weighting of different sources (Video Streaming App and The White House)
# Compare relative word usage between The White House and Periscope sources.
# FIX: the >= 5 cutoff must be evaluated per word (modern count() returns an
# ungrouped tibble, so the original ungrouped filter compared the grand total
# and filtered essentially nothing).
WhiteHouse_periscope_ratios <- tweet_words %>%
  count(word, source) %>%
  group_by(word) %>%
  filter(sum(n) >= 5) %>%          # keep words used at least 5 times overall
  ungroup() %>%
  # pivot_wider replaces the superseded spread().
  pivot_wider(names_from = source, values_from = n, values_fill = 0) %>%
  # Laplace-smoothed within-source frequencies; across() replaces the
  # deprecated mutate_each()/funs() idiom.
  mutate(across(-word, ~ (.x + 1) / sum(.x + 1))) %>%
  mutate(logratio = log2(The_White_House / Periscope)) %>%
  arrange(desc(logratio))

# Top 8 discriminating words for each source on a log2-ratio scale.
WhiteHouse_periscope_ratios %>%
  group_by(logratio > 0) %>%
  top_n(8, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(width = 0.5) +
  # The original coord_fixed(ratio = 1e6) was dead code: the subsequent
  # coord_flip() replaced it, so it is dropped here.
  coord_flip() +
  ylab("The White House / Periscope log ratio") +
  scale_fill_manual(name = "", labels = c("The White House", "Periscope"),
                    values = c("red", "lightblue"))

# Overall NRC sentiment distribution across all POTUS tweet words.
sentiment_overview <- tweet_words %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE)
sentiment_overview
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 positive 761
## 2 trust 536
## 3 anticipation 418
## 4 negative 326
## 5 joy 259
## 6 fear 185
## 7 sadness 147
## 8 anger 133
## 9 surprise 131
## 10 disgust 64

# Sentiment counts, most frequent at the top after the flip.
ggplot(sentiment_overview) +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(aes(x = reorder(sentiment, n), y = n)) +
  coord_flip()

# Per-tweet lookup of posting source and that source's total word count,
# used below to normalize sentiment counts by source size.
sources <- tweet_words %>%
  group_by(source) %>%
  mutate(total_words = n()) %>%
  ungroup() %>%
  distinct(id, source, total_words)
# NRC sentiment counts per tweet, completed with explicit zeros so every
# (sentiment, tweet) pair exists, then aggregated by posting source.
by_source_sentiment <- tweet_words %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(sentiment, id) %>%
  ungroup() %>%
  complete(sentiment, id, fill = list(n = 0)) %>%
  # Explicit key avoids the implicit-join message and documents the intent.
  inner_join(sources, by = "id") %>%
  group_by(source, sentiment, total_words) %>%
  summarize(words = sum(n)) %>%
  arrange(desc(total_words)) %>%
  ungroup()

by_source_sentiment
## # A tibble: 50 x 4
## source sentiment total_words words
## <chr> <chr> <int> <dbl>
## 1 The_White_House anger 4442 123
## 2 The_White_House anticipation 4442 393
## 3 The_White_House disgust 4442 57
## 4 The_White_House fear 4442 171
## 5 The_White_House joy 4442 247
## 6 The_White_House negative 4442 304
## 7 The_White_House positive 4442 692
## 8 The_White_House sadness 4442 136
## 9 The_White_House surprise 4442 123
## 10 The_White_House trust 4442 494
## # ... with 40 more rows
# Per-word, per-sentiment frequencies across the whole POTUS corpus.
nrc_word_counts <- tweet_words %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

nrc_word_counts
## # A tibble: 960 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 plan anticipation 87
## 2 rescue anticipation 52
## 3 rescue joy 52
## 4 rescue positive 52
## 5 rescue surprise 52
## 6 rescue trust 52
## 7 time anticipation 47
## 8 president positive 39
## 9 president trust 39
## 10 nation trust 30
## # ... with 950 more rows

# Top 8 words per sentiment, one facet per NRC category.
nrc_word_counts %>%
  group_by(sentiment) %>%
  top_n(8, n) %>%
  arrange(desc(n)) %>%
  ungroup() %>%
  ggplot() +
  # geom_col() replaces the misused geom_histogram(stat = "identity").
  geom_col(aes(x = reorder(word, n), y = n), width = 0.75) +
  coord_flip() +
  facet_wrap(vars(sentiment), scales = "free") +
  theme(axis.text.y = element_text(size = rel(0.75))) +
  labs(x = "Word", y = "", title = "Frequency of Occurrence by Sentiment")

# Preprocessing
# The 50 most-favorited tweets.
# BUG FIX: top_n(50) with no weight column ranks by the LAST column of `df`
# (`hour`), not by favoriteCount as intended; make the ranking explicit.
favorite <- df %>%
  arrange(desc(favoriteCount)) %>%
  top_n(50, favoriteCount)

# Collecting Tweets Words (same tokenization as the full corpus).
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
favorite_words <- favorite %>%
  filter(!str_detect(text, '^"')) %>%
  mutate(text = str_replace_all(text, "https://t.co/[A-Za-z\\d]+|&", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]")) %>%
  arrange(desc(favoriteCount))

# View the difference between whole POTUS tweets and top 50 favorite tweets
sentiment_overview_50 <- favorite_words %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  filter(!is.na(sentiment)) %>%
  count(sentiment, sort = TRUE)

sentiment_overview_50
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 positive 49
## 2 trust 45
## 3 negative 26
## 4 anticipation 21
## 5 fear 18
## 6 joy 16
## 7 anger 14
## 8 sadness 13
## 9 surprise 9
## 10 disgust 4

sentiment_overview_50 %>%
  ggplot() +
  # geom_col() is the idiomatic form of geom_bar(stat = "identity").
  geom_col(aes(x = reorder(sentiment, n), y = n)) +
  coord_flip()

# NOTE(review): this intentionally overwrites the earlier corpus-wide
# `nrc_word_counts` with the top-50-favorites version.
nrc_word_counts <- favorite_words %>%
  inner_join(get_sentiments("nrc"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
nrc_word_counts
## # A tibble: 187 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 president positive 4
## 2 president trust 4
## 3 nation trust 3
## 4 cancer anger 2
## 5 cancer disgust 2
## 6 cancer fear 2
## 7 cancer negative 2
## 8 cancer sadness 2
## 9 defense anger 2
## 10 defense anticipation 2
## # ... with 177 more rows

# Words appearing more than once in the top-50 tweets, faceted by sentiment.
nrc_word_counts %>%
  filter(n > 1) %>%
  ggplot() +
  # geom_col() replaces the misused geom_histogram(stat = "identity").
  geom_col(aes(x = reorder(word, n), y = n), width = 0.75) +
  coord_flip() +
  facet_wrap(vars(sentiment), scales = "free") +
  theme(axis.text.y = element_text(size = rel(0.75))) +
  labs(x = "Word", y = "", title = "Words with Top Fifty FavoriteCount by Sentiment")

# Tokenize Biden's personal-account tweets (AllJoeBidenTweets) the same way,
# ranking by likes instead of favoriteCount.
reg <- "([^A-Za-z\\d#@']|'(?![A-Za-z\\d#@]))"
tweet_words2 <- JoeBidenTweets %>%
  filter(!str_detect(tweet, '^"')) %>%
  mutate(text = str_replace_all(tweet, "https://t.co/[A-Za-z\\d]+|&", "")) %>%
  unnest_tokens(word, text, token = "regex", pattern = reg) %>%
  filter(!word %in% stop_words$word,
         str_detect(word, "[a-z]")) %>%
  arrange(desc(likes))
# Per-word, per-sentiment frequencies for the personal account.
# NOTE(review): left_join keeps words with no NRC match (sentiment == NA), so
# the facet plot below gets an NA panel — confirm this is intended; the POTUS
# analyses above use inner_join instead.
sentiment_counts <- tweet_words2 %>%
  left_join(get_sentiments("nrc"), by = "word") %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

# Top 8 words per sentiment, one facet per NRC category.
sentiment_counts %>%
  group_by(sentiment) %>%
  top_n(8, n) %>%
  arrange(desc(n)) %>%
  ungroup() %>%
  ggplot() +
  # geom_col() replaces the misused geom_histogram(stat = "identity").
  geom_col(aes(x = reorder(word, n), y = n), width = 0.75) +
  coord_flip() +
  facet_wrap(vars(sentiment), scales = "free") +
  theme(axis.text.y = element_text(size = rel(0.75))) +
  labs(x = "Word", y = "", title = "Frequency of Occurrence by Sentiment")